In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from fbprophet import Prophet

%matplotlib inline
In [2]:
# Load the avocado price dataset and preview the first 20 rows.
# Provenance note (review): this looks like the Kaggle "Avocado Prices" CSV —
# weekly average prices and volumes per US region, 2015-2018. Confirm source.
df = pd.read_csv('avocado.csv')
df.head(20)
Out[2]:
Unnamed: 0 Date AveragePrice Total Volume 4046 4225 4770 Total Bags Small Bags Large Bags XLarge Bags type year region
0 0 2015-12-27 1.33 64236.62 1036.74 54454.85 48.16 8696.87 8603.62 93.25 0.0 conventional 2015 Albany
1 1 2015-12-20 1.35 54876.98 674.28 44638.81 58.33 9505.56 9408.07 97.49 0.0 conventional 2015 Albany
2 2 2015-12-13 0.93 118220.22 794.70 109149.67 130.50 8145.35 8042.21 103.14 0.0 conventional 2015 Albany
3 3 2015-12-06 1.08 78992.15 1132.00 71976.41 72.58 5811.16 5677.40 133.76 0.0 conventional 2015 Albany
4 4 2015-11-29 1.28 51039.60 941.48 43838.39 75.78 6183.95 5986.26 197.69 0.0 conventional 2015 Albany
5 5 2015-11-22 1.26 55979.78 1184.27 48067.99 43.61 6683.91 6556.47 127.44 0.0 conventional 2015 Albany
6 6 2015-11-15 0.99 83453.76 1368.92 73672.72 93.26 8318.86 8196.81 122.05 0.0 conventional 2015 Albany
7 7 2015-11-08 0.98 109428.33 703.75 101815.36 80.00 6829.22 6266.85 562.37 0.0 conventional 2015 Albany
8 8 2015-11-01 1.02 99811.42 1022.15 87315.57 85.34 11388.36 11104.53 283.83 0.0 conventional 2015 Albany
9 9 2015-10-25 1.07 74338.76 842.40 64757.44 113.00 8625.92 8061.47 564.45 0.0 conventional 2015 Albany
10 10 2015-10-18 1.12 84843.44 924.86 75595.85 117.07 8205.66 7877.86 327.80 0.0 conventional 2015 Albany
11 11 2015-10-11 1.28 64489.17 1582.03 52677.92 105.32 10123.90 9866.27 257.63 0.0 conventional 2015 Albany
12 12 2015-10-04 1.31 61007.10 2268.32 49880.67 101.36 8756.75 8379.98 376.77 0.0 conventional 2015 Albany
13 13 2015-09-27 0.99 106803.39 1204.88 99409.21 154.84 6034.46 5888.87 145.59 0.0 conventional 2015 Albany
14 14 2015-09-20 1.33 69759.01 1028.03 59313.12 150.50 9267.36 8489.10 778.26 0.0 conventional 2015 Albany
15 15 2015-09-13 1.28 76111.27 985.73 65696.86 142.00 9286.68 8665.19 621.49 0.0 conventional 2015 Albany
16 16 2015-09-06 1.11 99172.96 879.45 90062.62 240.79 7990.10 7762.87 227.23 0.0 conventional 2015 Albany
17 17 2015-08-30 1.07 105693.84 689.01 94362.67 335.43 10306.73 10218.93 87.80 0.0 conventional 2015 Albany
18 18 2015-08-23 1.34 79992.09 733.16 67933.79 444.78 10880.36 10745.79 134.57 0.0 conventional 2015 Albany
19 19 2015-08-16 1.33 80043.78 539.65 68666.01 394.90 10443.22 10297.68 145.54 0.0 conventional 2015 Albany
In [3]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18249 entries, 0 to 18248
Data columns (total 14 columns):
Unnamed: 0      18249 non-null int64
Date            18249 non-null object
AveragePrice    18249 non-null float64
Total Volume    18249 non-null float64
4046            18249 non-null float64
4225            18249 non-null float64
4770            18249 non-null float64
Total Bags      18249 non-null float64
Small Bags      18249 non-null float64
Large Bags      18249 non-null float64
XLarge Bags     18249 non-null float64
type            18249 non-null object
year            18249 non-null int64
region          18249 non-null object
dtypes: float64(9), int64(2), object(3)
memory usage: 1.9+ MB
In [4]:
# Rename the PLU-code columns (4046 / 4225 / 4770) to human-readable Hass size
# names. The original call used `index=str` (a leftover pandas-docs artifact
# that also silently casts the row index to strings) and `inplace=True`;
# explicit reassignment is the idiomatic, chainable form and keeps the
# RangeIndex intact.
df = df.rename(columns={"4046": "Small Hass", "4225": "Large Hass", "4770": "Extra-Large Hass"})
In [5]:
# Drop the redundant 'Unnamed: 0' column (the CSV's saved row index).
# `columns=` is clearer than `axis=1`, and reassignment avoids the
# inplace=True anti-pattern (no performance benefit, hampers re-runs).
df = df.drop(columns=['Unnamed: 0'])
In [6]:
# Parse the Date column (read in as strings) into datetime64 values so it can
# drive time-series plots and serve as Prophet's `ds` column later.
df['Date'] = pd.to_datetime(df['Date'])
In [7]:
df.head()
Out[7]:
Date AveragePrice Total Volume Small Hass Large Hass Extra-Large Hass Total Bags Small Bags Large Bags XLarge Bags type year region
0 2015-12-27 1.33 64236.62 1036.74 54454.85 48.16 8696.87 8603.62 93.25 0.0 conventional 2015 Albany
1 2015-12-20 1.35 54876.98 674.28 44638.81 58.33 9505.56 9408.07 97.49 0.0 conventional 2015 Albany
2 2015-12-13 0.93 118220.22 794.70 109149.67 130.50 8145.35 8042.21 103.14 0.0 conventional 2015 Albany
3 2015-12-06 1.08 78992.15 1132.00 71976.41 72.58 5811.16 5677.40 133.76 0.0 conventional 2015 Albany
4 2015-11-29 1.28 51039.60 941.48 43838.39 75.78 6183.95 5986.26 197.69 0.0 conventional 2015 Albany

Analyzing the categorical columns to understand the full scope of this dataset

In [8]:
df['region'].value_counts()
Out[8]:
MiamiFtLauderdale      338
LosAngeles             338
Tampa                  338
West                   338
HartfordSpringfield    338
Portland               338
CincinnatiDayton       338
SouthCarolina          338
Boise                  338
Midsouth               338
Albany                 338
Detroit                338
Pittsburgh             338
Sacramento             338
California             338
Atlanta                338
Columbus               338
Seattle                338
NorthernNewEngland     338
LasVegas               338
SouthCentral           338
Indianapolis           338
NewYork                338
Chicago                338
StLouis                338
Houston                338
Northeast              338
RaleighGreensboro      338
Orlando                338
Philadelphia           338
NewOrleansMobile       338
Plains                 338
Charlotte              338
HarrisburgScranton     338
PhoenixTucson          338
Boston                 338
Southeast              338
DallasFtWorth          338
GreatLakes             338
Syracuse               338
Denver                 338
SanFrancisco           338
Roanoke                338
TotalUS                338
Nashville              338
GrandRapids            338
Louisville             338
SanDiego               338
Spokane                338
RichmondNorfolk        338
Jacksonville           338
BaltimoreWashington    338
BuffaloRochester       338
WestTexNewMexico       335
Name: region, dtype: int64
In [9]:
df['region'].nunique()
Out[9]:
54
In [10]:
df['year'].value_counts()
Out[10]:
2017    5722
2016    5616
2015    5615
2018    1296
Name: year, dtype: int64
In [11]:
df['type'].value_counts()
Out[11]:
conventional    9126
organic         9123
Name: type, dtype: int64

Here we see that there are two different types of avocados: conventional and organic. Let's separate these two types from each other.

In [12]:
# Partition the dataset into one frame per avocado type.
is_conventional = df['type'] == 'conventional'
is_organic = df['type'] == 'organic'
conventional_frame = df.loc[is_conventional]
organic_frame = df.loc[is_organic]
In [13]:
organic_frame.info()
<class 'pandas.core.frame.DataFrame'>
Index: 9123 entries, 9126 to 18248
Data columns (total 13 columns):
Date                9123 non-null datetime64[ns]
AveragePrice        9123 non-null float64
Total Volume        9123 non-null float64
Small Hass          9123 non-null float64
Large Hass          9123 non-null float64
Extra-Large Hass    9123 non-null float64
Total Bags          9123 non-null float64
Small Bags          9123 non-null float64
Large Bags          9123 non-null float64
XLarge Bags         9123 non-null float64
type                9123 non-null object
year                9123 non-null int64
region              9123 non-null object
dtypes: datetime64[ns](1), float64(9), int64(1), object(2)
memory usage: 997.8+ KB
In [14]:
organic_frame['AveragePrice'].mean()
Out[14]:
1.6539986846432082
In [15]:
conventional_frame['AveragePrice'].mean()
Out[15]:
1.1580396668858206
In [16]:
organic_frame[['AveragePrice','Small Hass']]
Out[16]:
AveragePrice Small Hass
9126 1.83 8.16
9127 1.89 30.24
9128 1.85 10.44
9129 1.84 90.29
9130 1.94 0.00
9131 1.94 13.84
9132 1.89 20.71
9133 1.88 20.08
9134 1.88 11.47
9135 1.83 49.27
9136 1.97 10.31
9137 1.90 28.65
9138 1.98 5.74
9139 1.98 13.79
9140 1.98 42.63
9141 1.99 13.86
9142 1.86 30.13
9143 1.88 17.27
9144 1.87 24.45
9145 2.00 24.56
9146 1.88 79.82
9147 2.00 17.66
9148 2.01 99.16
9149 2.08 50.86
9150 2.01 34.27
9151 2.04 50.69
9152 2.02 22.35
9153 2.09 17.59
9154 2.03 79.45
9155 1.93 25.60
... ... ...
18219 1.56 98465.26
18220 1.53 117922.52
18221 1.61 118616.17
18222 1.63 108705.28
18223 1.59 145680.62
18224 1.51 129541.43
18225 1.60 26996.28
18226 1.73 33437.98
18227 1.63 27566.25
18228 1.46 25990.60
18229 1.49 34200.18
18230 1.64 30149.00
18231 1.47 24732.55
18232 1.41 22474.66
18233 1.80 22918.40
18234 1.83 27049.44
18235 1.82 33869.12
18236 1.48 34734.97
18237 1.62 2325.30
18238 1.56 2055.35
18239 1.56 2162.67
18240 1.54 1832.24
18241 1.57 1974.26
18242 1.56 1892.05
18243 1.57 1924.28
18244 1.63 2046.96
18245 1.71 1191.70
18246 1.87 1191.92
18247 1.93 1527.63
18248 1.62 2894.77

9123 rows × 2 columns

Finding how many regions are recorded for each type of Avocado as well as how many entries per region

In [17]:
# Group organic rows by region and report how many weekly entries each has.
# (Grouping by the column name is equivalent to passing the Series itself.)
regions_organic = organic_frame.groupby('region')
print("Total regions for Organic avocado:", len(regions_organic))
print("-------------")
for region_name, region_rows in regions_organic:
    print(region_name, " : ", len(region_rows))
Total regions for Organic avocado: 54
-------------
Albany  :  169
Atlanta  :  169
BaltimoreWashington  :  169
Boise  :  169
Boston  :  169
BuffaloRochester  :  169
California  :  169
Charlotte  :  169
Chicago  :  169
CincinnatiDayton  :  169
Columbus  :  169
DallasFtWorth  :  169
Denver  :  169
Detroit  :  169
GrandRapids  :  169
GreatLakes  :  169
HarrisburgScranton  :  169
HartfordSpringfield  :  169
Houston  :  169
Indianapolis  :  169
Jacksonville  :  169
LasVegas  :  169
LosAngeles  :  169
Louisville  :  169
MiamiFtLauderdale  :  169
Midsouth  :  169
Nashville  :  169
NewOrleansMobile  :  169
NewYork  :  169
Northeast  :  169
NorthernNewEngland  :  169
Orlando  :  169
Philadelphia  :  169
PhoenixTucson  :  169
Pittsburgh  :  169
Plains  :  169
Portland  :  169
RaleighGreensboro  :  169
RichmondNorfolk  :  169
Roanoke  :  169
Sacramento  :  169
SanDiego  :  169
SanFrancisco  :  169
Seattle  :  169
SouthCarolina  :  169
SouthCentral  :  169
Southeast  :  169
Spokane  :  169
StLouis  :  169
Syracuse  :  169
Tampa  :  169
TotalUS  :  169
West  :  169
WestTexNewMexico  :  166
In [18]:
# Same per-region entry count, for the conventional subset.
regions_conventional = conventional_frame.groupby('region')
print("Total regions for Conventional avocado:", len(regions_conventional))
print("-------------")
for region_name, region_rows in regions_conventional:
    print(region_name, " : ", len(region_rows))
Total regions for Conventional avocado: 54
-------------
Albany  :  169
Atlanta  :  169
BaltimoreWashington  :  169
Boise  :  169
Boston  :  169
BuffaloRochester  :  169
California  :  169
Charlotte  :  169
Chicago  :  169
CincinnatiDayton  :  169
Columbus  :  169
DallasFtWorth  :  169
Denver  :  169
Detroit  :  169
GrandRapids  :  169
GreatLakes  :  169
HarrisburgScranton  :  169
HartfordSpringfield  :  169
Houston  :  169
Indianapolis  :  169
Jacksonville  :  169
LasVegas  :  169
LosAngeles  :  169
Louisville  :  169
MiamiFtLauderdale  :  169
Midsouth  :  169
Nashville  :  169
NewOrleansMobile  :  169
NewYork  :  169
Northeast  :  169
NorthernNewEngland  :  169
Orlando  :  169
Philadelphia  :  169
PhoenixTucson  :  169
Pittsburgh  :  169
Plains  :  169
Portland  :  169
RaleighGreensboro  :  169
RichmondNorfolk  :  169
Roanoke  :  169
Sacramento  :  169
SanDiego  :  169
SanFrancisco  :  169
Seattle  :  169
SouthCarolina  :  169
SouthCentral  :  169
Southeast  :  169
Spokane  :  169
StLouis  :  169
Syracuse  :  169
Tampa  :  169
TotalUS  :  169
West  :  169
WestTexNewMexico  :  169

Now we can make predictions based on a specific region we choose from either conventional or organic avocados. Let's start with organic and choose the "TotalUS" region.

In [19]:
date_price = regions_organic.get_group("TotalUS")[['Date', 'AveragePrice']].reset_index(drop=True)
In [20]:
# Plot the organic TotalUS average price over time and save the figure.
# Removed the dead commented-out subplots line; saving through the Axes'
# parent Figure is more robust than plt.savefig, which depends on whichever
# figure pyplot currently considers "active".
ax = date_price.plot(x='Date', y='AveragePrice', kind="line", figsize=(15, 10))
ax.figure.savefig('line_organic_avgp.png', bbox_inches='tight')
In [21]:
# Prophet requires the timestamp column to be named `ds` and the target `y`.
date_price = date_price.rename(columns={'Date':'ds', 'AveragePrice':'y'})
In [22]:
# Create & fit a Prophet model on the organic TotalUS series. See the quick
# start guide: https://facebook.github.io/prophet/docs/quick_start.html
# Prophet auto-detects yearly seasonality here and disables weekly/daily
# seasonality (per the INFO log below) since the data is weekly.
# NOTE(review): the `fbprophet` package was renamed to `prophet` in v1.0;
# new environments should install and import `prophet` instead.

m = Prophet()
m.fit(date_price)
INFO:fbprophet.forecaster:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:fbprophet.forecaster:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
C:\Users\Omar\Anaconda3\lib\site-packages\pystan\misc.py:399: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  elif np.issubdtype(np.asarray(v).dtype, float):
Out[22]:
<fbprophet.forecaster.Prophet at 0x2ceacc919e8>
In [23]:
# Build a frame extending 365 periods past the last history date. By default
# it also includes the historical dates, so predict() returns the in-sample
# fit as well.
# NOTE(review): make_future_dataframe defaults to daily frequency even though
# this series is weekly — pass freq='W' if weekly forecast steps are intended.

future = m.make_future_dataframe(periods=365)
In [24]:
# Last dates that will be forecasted (history ends 2018-03; horizon runs to
# late March 2019).
future.tail()
Out[24]:
ds
529 2019-03-21
530 2019-03-22
531 2019-03-23
532 2019-03-24
533 2019-03-25
In [25]:
# predict() assigns each row in `future` a forecast `yhat`, plus component
# columns and the `yhat_lower`/`yhat_upper` uncertainty interval. Historical
# dates get an in-sample fit.

forecast = m.predict(future)
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()
Out[25]:
ds yhat yhat_lower yhat_upper
529 2019-03-21 1.777671 1.603879 1.946662
530 2019-03-22 1.785573 1.620681 1.977702
531 2019-03-23 1.793586 1.615191 1.959374
532 2019-03-24 1.801608 1.621226 1.968077
533 2019-03-25 1.809540 1.636876 1.970276
In [26]:
# Plot the forecast (history + 365-day horizon) via Prophet.plot.
# Save through the returned Figure object rather than plt.savefig, which
# silently saves whatever figure happens to be pyplot's current one.
fig1 = m.plot(forecast)
fig1.savefig('forecast_organic.png', bbox_inches='tight')

Here we can see how the individual components of the model affect the predictions

In [27]:
# If you want to see the forecast components, you can use the Prophet.plot_components method. By default you’ll see the trend, 
# yearly seasonality, and weekly seasonality of the time series. If you include holidays, you’ll see those here, too.

fig2 = m.plot_components(forecast)
plt.savefig('forecast_organic_components.png',bbox_inches='tight')

Now let's analyze conventional avocados in the TotalUS region instead.

In [28]:
date_price = regions_conventional.get_group("TotalUS")[['Date', 'AveragePrice']].reset_index(drop=True)
In [29]:
# Plot the conventional TotalUS average price over time and save the figure,
# using the Axes' parent Figure instead of pyplot's implicit current figure.
ax = date_price.plot(x='Date', y='AveragePrice', kind="line", figsize=(15, 10))
ax.figure.savefig('line_conventional_avgp.png', bbox_inches='tight')
In [30]:
date_price = date_price.rename(columns={'Date':'ds', 'AveragePrice':'y'})
In [31]:
# Create & fit a fresh Prophet model on the conventional TotalUS series
# (rebinding `m`, so the organic model above is no longer reachable).
# Quick start: https://facebook.github.io/prophet/docs/quick_start.html

m = Prophet()
m.fit(date_price)
INFO:fbprophet.forecaster:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:fbprophet.forecaster:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
C:\Users\Omar\Anaconda3\lib\site-packages\pystan\misc.py:399: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  elif np.issubdtype(np.asarray(v).dtype, float):
Out[31]:
<fbprophet.forecaster.Prophet at 0x2ceac8b2668>
In [32]:
# Extend 365 daily periods past the last history date; historical dates are
# included so predict() also yields the in-sample fit.

future = m.make_future_dataframe(periods=365)
In [33]:
# Last dates that will be forecasted up to (late March 2019).
future.tail()
Out[33]:
ds
529 2019-03-21
530 2019-03-22
531 2019-03-23
532 2019-03-24
533 2019-03-25
In [34]:
# Forecast the conventional series: `yhat` with `yhat_lower`/`yhat_upper`
# uncertainty bounds per future date.

forecast = m.predict(future)
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()
Out[34]:
ds yhat yhat_lower yhat_upper
529 2019-03-21 1.100679 0.971947 1.233577
530 2019-03-22 1.101912 0.981304 1.245725
531 2019-03-23 1.103614 0.980032 1.237514
532 2019-03-24 1.105771 0.976225 1.240768
533 2019-03-25 1.108353 0.973552 1.242185
In [35]:
# Plot the conventional forecast; save via the returned Figure object rather
# than plt.savefig's implicit current-figure lookup.
fig1 = m.plot(forecast)
fig1.savefig('forecast_conventional.png', bbox_inches='tight')
In [36]:
# Component breakdown of the conventional forecast (trend + yearly
# seasonality by default). Saved via the returned Figure for robustness.

fig2 = m.plot_components(forecast)
fig2.savefig('forecast_conventional_components.png', bbox_inches='tight')